In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
from ggplot import *
In [3]:
def parse_barcodes(bcfile, bc_id='BC'):
    '''
    Read a barcode CSV and return a {barcode_id: barcode_sequence} dict.

    Only rows whose first field starts with `bc_id` are kept.

    Parameters
    ----------
    bcfile : str
        Path to a CSV file whose first two columns are barcode id and
        barcode sequence.
    bc_id : str
        Prefix identifying barcode rows (default 'BC').

    Returns
    -------
    dict mapping barcode id -> barcode sequence.
    '''
    res = {}
    with open(bcfile, 'r') as fi:
        # csv.reader (the csv module is already imported at the top of the
        # notebook) handles quoted fields, which a plain str.split(',')
        # would mis-parse. The len() guard skips malformed/short rows
        # instead of raising IndexError.
        for fields in csv.reader(fi):
            if len(fields) > 1 and fields[0].startswith(bc_id):
                res[fields[0]] = fields[1]
    return res
def parse_exp_config(expfile, bc_dict):
    '''
    Parse the experiment configuration CSV into a DataFrame.

    Rows with an empty `id` field are skipped. For each kept row the
    result carries: `sample`, `bc_id` (barcode id), `bc_seq` (sequence
    looked up in `bc_dict`), `temp` (integer parsed from the first two
    characters of `cond`), and `bcm` (True when `cond` contains '+').

    Parameters
    ----------
    expfile : str
        Path to the experiment config CSV (no header row expected).
    bc_dict : dict
        Barcode id -> barcode sequence, as produced by parse_barcodes().

    Returns
    -------
    pd.DataFrame with columns sample, bc_id, bc_seq, temp, bcm.
    '''
    fieldnames = ['id', 'sample', 'cond', 'barcode', 'size',
                  'region', 'Qbit', 'conc', 'dilution']
    records = []
    with open(expfile) as handle:
        for row in csv.DictReader(handle, fieldnames=fieldnames):
            # Guard clause: rows without an id are padding/blank lines.
            if not row['id']:
                continue
            condition = row['cond']
            records.append({
                'sample': row['sample'],
                'bc_id': row['barcode'],
                'bc_seq': bc_dict[row['barcode']],
                'temp': int(condition[:2]),
                'bcm': '+' in condition,
            })
    return pd.DataFrame.from_records(records)
In [4]:
# Load barcode map, experiment metadata, and the aggregated UTR counts.
bc_dict = parse_barcodes('../data/Lexogen_Sense_RNA-Seq.csv')
exp_df = parse_exp_config('../data/2017-03-09_NextSeq.csv', bc_dict)
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 is the documented replacement for its
# index-in-first-column behavior.
agg_utr = pd.read_csv('../data/utr.counts.csv', index_col=0)
agg_utr
Out[4]:
In [5]:
def normalize(df, edf, columns=None):
    '''
    Prepares the UTR dataframe (`df`) for log transformation.

    Adds experiment metadata from `edf` (left-merged on `sample`),
    replaces zero `utr_counts` / `UTR_length` values with a pseudocount
    of 1 so a later log transform is defined, and normalizes counts to
    UTR length.

    Parameters
    ----------
    df : pd.DataFrame
        UTR data; must contain `sample`, `utr_counts`, `UTR_length`.
    edf : pd.DataFrame
        Experiment metadata with a `sample` column.
    columns : list of str, optional
        If given, restrict the merged frame to these columns before
        normalization (must include `utr_counts` and `UTR_length`).

    Returns
    -------
    pd.DataFrame with an added `utr_norm` column (counts / length).
    '''
    df = df.merge(edf, how='left', on='sample')
    if columns is not None:
        # .copy() makes an explicit new frame: assigning columns to the
        # plain df[columns] slice triggers SettingWithCopyWarning and may
        # silently fail to write in some pandas versions.
        df = df[columns].copy()
    # Only zeros become 1 (same semantics as the original per-element
    # pseudo_counts helper), but replace() is vectorized instead of a
    # Python-level apply().
    df['utr_counts'] = df['utr_counts'].replace(0, 1)
    df['UTR_length'] = df['UTR_length'].replace(0, 1)
    df['utr_norm'] = df['utr_counts'] / df['UTR_length']
    return df
In [6]:
# Columns to keep from the merged counts/metadata frame before
# normalization (must include utr_counts and UTR_length).
columns = ['gene', 'TSS', 'start', 'end', 'UTR_length',
'utr_counts', 'sample', 'bcm', 'temp']
# Merge metadata, add pseudocounts, and compute length-normalized counts.
utr = normalize(agg_utr, exp_df, columns)
utr
Out[6]:
In [7]:
# Build the expression matrix for the -BCM samples: one column per
# sample, rows are that sample's length-normalized UTR counts.
# NOTE(review): assigning .values positionally assumes every sample's
# rows in `utr` appear in the same UTR order — confirm against how
# utr.counts.csv was produced.
X = pd.DataFrame()
samples = []
# sorted() makes the sample order (and hence the PCA row order and the
# `samples` list used for labelling later) deterministic across runs;
# iterating a raw set is subject to hash randomization.
for sample in sorted(set(utr['sample'])):
    mask = (utr['sample'] == sample) & (utr['bcm'] == False)
    if not utr[mask].empty:
        X[sample] = utr[mask]['utr_norm'].values
        samples.append(sample)
# Transpose so rows are samples, then standardize each feature for PCA.
X_std = StandardScaler().fit_transform(X.values.T)
X_std
Out[7]:
In [8]:
# PCA on the standardized sample-by-UTR matrix (-BCM samples).
# NOTE(review): n_components=10 requires min(n_samples, n_features) >= 10
# in scikit-learn — confirm there are at least 10 samples here.
sklearn_pca = sklearnPCA(n_components=10)
Y = sklearn_pca.fit_transform(X_std)
print(Y)
print(sklearn_pca.explained_variance_)
print(sklearn_pca.explained_variance_ratio_)
In [9]:
# Scree plot: explained variance ratio per principal component (-BCM).
variance_ratio = sklearn_pca.explained_variance_ratio_
vdf = pd.DataFrame()
vdf['PC'] = list(range(1, len(variance_ratio) + 1))
vdf['var'] = variance_ratio
g = (ggplot(vdf, aes(x='PC', y='var'))
     + geom_point(size=10)
     + ylab('Explained variance')
     + ggtitle('Unfiltered -BCM'))
print(g)
In [11]:
# Condition label (temperature) and first two PCs for each -BCM sample.
pca_df = pd.DataFrame()
# Select the scalar explicitly with .loc[...].iloc[0]: formatting a whole
# Series with %d relied on implicit single-element Series->int conversion,
# which raises a TypeError in modern pandas.
pca_df['cond'] = ['%doC' % exp_df.loc[exp_df['sample'] == sample, 'temp'].iloc[0]
                  for sample in samples]
pca_df['PC1'] = Y[:, 0]
pca_df['PC2'] = Y[:, 1]
pca_df
Out[11]:
In [12]:
# Scatter of the first two principal components, colored by condition.
g = (ggplot(pca_df, aes(x='PC1', y='PC2', color='cond'))
     + geom_point(size=10)
     + ggtitle('Unfiltered -BCM'))
print(g)
In [13]:
# Build the expression matrix for the +BCM samples (same construction as
# the -BCM cell above — consider extracting a shared helper function).
# NOTE(review): assigning .values positionally assumes every sample's
# rows in `utr` appear in the same UTR order — confirm against how
# utr.counts.csv was produced.
X = pd.DataFrame()
samples = []
# sorted() makes the sample order (and hence the PCA row order and the
# `samples` list used for labelling later) deterministic across runs;
# iterating a raw set is subject to hash randomization.
for sample in sorted(set(utr['sample'])):
    mask = (utr['sample'] == sample) & (utr['bcm'] == True)
    if not utr[mask].empty:
        X[sample] = utr[mask]['utr_norm'].values
        samples.append(sample)
# Transpose so rows are samples, then standardize each feature for PCA.
X_std = StandardScaler().fit_transform(X.values.T)
X_std
Out[13]:
In [14]:
# PCA on the standardized sample-by-UTR matrix (+BCM samples).
# NOTE(review): n_components=10 requires min(n_samples, n_features) >= 10
# in scikit-learn — confirm there are at least 10 samples here.
sklearn_pca = sklearnPCA(n_components=10)
Y = sklearn_pca.fit_transform(X_std)
print(Y)
print(sklearn_pca.explained_variance_)
print(sklearn_pca.explained_variance_ratio_)
In [15]:
# Scree plot: explained variance ratio per principal component (+BCM).
variance_ratio = sklearn_pca.explained_variance_ratio_
vdf = pd.DataFrame()
vdf['PC'] = list(range(1, len(variance_ratio) + 1))
vdf['var'] = variance_ratio
g = (ggplot(vdf, aes(x='PC', y='var'))
     + geom_point(size=10)
     + ylab('Explained variance')
     + ggtitle('Unfiltered +BCM'))
print(g)
In [16]:
# Condition label (temperature) and first two PCs for each +BCM sample.
pca_df = pd.DataFrame()
# Select the scalar explicitly with .loc[...].iloc[0]: formatting a whole
# Series with %d relied on implicit single-element Series->int conversion,
# which raises a TypeError in modern pandas.
pca_df['cond'] = ['%doC' % exp_df.loc[exp_df['sample'] == sample, 'temp'].iloc[0]
                  for sample in samples]
pca_df['PC1'] = Y[:, 0]
pca_df['PC2'] = Y[:, 1]
pca_df
Out[16]:
In [17]:
# Scatter of the first two principal components, colored by condition.
g = (ggplot(pca_df, aes(x='PC1', y='PC2', color='cond'))
     + geom_point(size=10)
     + ggtitle('Unfiltered +BCM'))
print(g)
In [ ]: